import re

# Default input table scanned by main() when no path is supplied.
_DEFAULT_DATA_PATH = '/Users/btemperton/Dropbox/PhD/results/Bioinformatics/GOS/all.site.abundance.data.txt'


def main(path=_DEFAULT_DATA_PATH):
    """Report how many distinct sites and genes appear in an abundance table.

    The table is read line by line. A line is considered a record when it
    consists of four tab-separated fields: a gene identifier (no
    whitespace), a site identifier beginning with ``GS``, a free-text field
    and a trailing integer. Lines that do not have this shape are ignored.

    After printing the totals, the file is re-scanned once per gene (the
    per-gene processing itself is not implemented yet) and the two
    diversity-index routines are invoked.

    Args:
        path: Location of the abundance table. Defaults to the path that
            was previously hard-coded in this function.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # Raw string: \S and \d must reach the regex engine untouched; compiled
    # once because it is applied to every line of every pass.
    record_pattern = re.compile(r'^(\S+)\t(GS\S+)\t(.*)\t(\d+)')

    # Lists keep first-seen order; the companion sets make the membership
    # test O(1) instead of a linear scan of the list for every line.
    genes = []
    sites = []
    seen_genes = set()
    seen_sites = set()

    with open(path, 'r') as handle:
        for line in handle:
            match = record_pattern.match(line)
            if not match:
                continue
            gene = match.group(1)
            site = match.group(2)
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes.append(gene)
            if site not in seen_sites:
                seen_sites.add(site)
                sites.append(site)

    # Argument order must follow the message: sites first, then genes.
    print("The total number of sites is %d, the total number of genes is %d" % (len(sites), len(genes)))

    # NOTE(review): this re-reads the whole file once per gene
    # (genes x lines work) and currently does nothing with the matches.
    # When the per-gene logic is written, consider grouping records by gene
    # in a single pass instead.
    for gene in genes:
        with open(path, 'r') as handle:
            for line in handle:
                match = record_pattern.match(line)
                # Compare the captured first field rather than splicing the
                # gene into a regex: identifiers containing regex
                # metacharacters would otherwise mis-match or fail to compile.
                if match and match.group(1) == gene:
                    pass  # TODO: per-gene record processing
    calculateSimpson()
    calculateShannon()


def calculateSimpson():
    """Placeholder with no implementation yet.

    Takes no arguments, performs no work and returns ``None``.
    NOTE(review): presumably intended to compute Simpson's diversity
    index -- confirm before implementing.
    """
    return None

def calculateShannon():
    """Placeholder with no implementation yet.

    Takes no arguments, performs no work and returns ``None``.
    NOTE(review): presumably intended to compute the Shannon diversity
    index -- confirm before implementing.
    """
    return None


# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()